In [1]:
import pandas as pd
import sklearn.neural_network as sklnn
import sklearn.tree as skltr
In [2]:
# Load the raw NBA shot log; expects shot_logs.csv next to the notebook.
shots = pd.read_csv('./shot_logs.csv')
In [3]:
# Count rows that contain at least one missing value.
shots.isna().any(axis=1).sum()
Out[3]:
In [4]:
# Replace missing values with 0 rather than dropping rows.
# NOTE(review): a blanket 0-fill touches every column, numeric or not —
# confirm per column that 0 is a sensible stand-in for "missing".
# Rebinding instead of inplace=True keeps the cell idempotent on re-run.
shots = shots.fillna(0)
shots.head()
Out[4]:
In [5]:
# Keep only features that could plausibly drive a shot's outcome:
# IDs, game metadata, and outcome-derived columns (FGM, PTS) are removed.
columns_to_drop = ['GAME_ID', 'MATCHUP', 'LOCATION', 'W', 'FINAL_MARGIN',
                   'FGM', 'PTS', 'CLOSEST_DEFENDER',
                   'CLOSEST_DEFENDER_PLAYER_ID', 'player_name', 'player_id']
treated = shots.drop(columns=columns_to_drop)
treated.head()
Out[5]:
In [6]:
# Game Clock is a string, we should convert it to seconds
def clockToSeconds(clock):
    """Convert a 'MM:SS' game-clock string to total seconds.

    After the earlier fillna(0), a missing GAME_CLOCK is the number 0,
    not a string; pass numeric values through as seconds instead of
    crashing on .split(':').
    """
    if not isinstance(clock, str):
        return int(clock)
    mins, secs = clock.split(':')
    return int(mins) * 60 + int(secs)
# Convert the clock column so GAME_CLOCK is numeric (seconds remaining).
treated['GAME_CLOCK'] = treated['GAME_CLOCK'].map(clockToSeconds)
treated.head()
Out[6]:
In [7]:
# Shuffle and split the data 70/30 into train and test sets.
# random_state pins the shuffle so the split (and every downstream
# accuracy number) is reproducible across kernel restarts.
train = treated.sample(frac=0.7, random_state=42)
test = treated.drop(train.index)

# Encode the target in one pass: made -> 0, missed -> 1, so the mean of
# the encoded column is the fraction of missed shots.
train_result = train['SHOT_RESULT'].map({'made': 0, 'missed': 1})
train = train.drop('SHOT_RESULT', axis=1)
test_result = test['SHOT_RESULT'].map({'made': 0, 'missed': 1})
test = test.drop('SHOT_RESULT', axis=1)
In [13]:
# Two-hidden-layer MLP (15 then 10 units, ReLU activation).
classifier = sklnn.MLPClassifier(activation='relu',
                                 hidden_layer_sizes=(15, 10),
                                 max_iter=5000,
                                 warm_start=True)
classifier.fit(train, train_result)
prediction = classifier.predict(test)
# Mean of the boolean match series == fraction of correct predictions.
mlp_accuracy = (prediction == test_result).mean()
print("Accuracy Using a (15,10) MLP: %.2f" % mlp_accuracy)
In [22]:
# Decision tree with entropy splits, considering at most 7 features per split.
tree = skltr.DecisionTreeClassifier(criterion='entropy', max_features=7)
tree.fit(train, train_result)
prediction = tree.predict(test)
# Mean of the boolean match series == fraction of correct predictions.
tree_accuracy = (prediction == test_result).mean()
print("Accuracy using Decision Tree: %.2f" % tree_accuracy)
In [21]:
# Since 'missed' is encoded as 1, test_result.mean() is the fraction of
# missed shots — i.e. the accuracy of always predicting "missed".
baseline_accuracy = test_result.mean()
print("Random guessing is around: %.2f" % baseline_accuracy)
We can see that a Decision Tree classifier using entropy as the split criterion performs no better than random guessing, while the MLP classifier improves the prediction accuracy only slightly.
In [ ]: